# -*- coding: utf-8 -*-
"""
Created on Wed Feb 16 19:36:21 2022

@author: 22155
"""
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow.compat.v1 as tf

import os
import pickle


# Data preprocessing
def load_data():
    """
    Load and preprocess the users, scenarys and ratings tables.

    Returns:
        (features, targets_values, ratings, users, scenarys, data) where
        `data` is the three-way merged table, `features` is the input
        matrix (ratings column dropped) and `targets_values` the rating
        labels, both as numpy arrays.
    """
    # --- users.txt ---
    user_columns = ['UserID', 'Gender', 'Age', 'OccupationID', 'Area', 'FT']
    users = pd.read_table('./data/users.txt', sep=':', header=None,
                          names=user_columns, engine='python')

    # Re-encode raw age values as consecutive integer categories.
    age_codes = {age: idx for idx, age in enumerate(set(users['Age']))}
    users['Age'] = users['Age'].map(age_codes)

    # Turn the favourite-type field ('a|b|...') into a numeric list of
    # fixed length 18, zero-padded on the right.
    ft_codes = {raw: [int(tok) for tok in raw.split('|')]
                for raw in set(users['FT'])}
    for codes in ft_codes.values():
        codes.extend([0] * (18 - len(codes)))
    users['FT'] = users['FT'].map(ft_codes)

    # --- scenarys.txt ---
    scenary_columns = ['scenaryID', 'Genres']
    scenarys = pd.read_table('./data/scenarys.txt', sep=':', encoding="latin1",
                             header=None, names=scenary_columns, engine='python')
    # Genre field becomes an equal-length (18) numeric list as well.
    genre_codes = {raw: [int(tok) for tok in raw.split('|')]
                   for raw in set(scenarys['Genres'])}
    for codes in genre_codes.values():
        codes.extend([0] * (18 - len(codes)))
    scenarys['Genres'] = scenarys['Genres'].map(genre_codes)

    # --- ratings.txt ---
    rating_columns = ['UserID', 'scenaryID', 'ratings']
    ratings = pd.read_table('./data/ratings.txt', sep=':', header=None,
                            names=rating_columns, engine='python')

    # Join the three tables on their shared ID columns.
    data = pd.merge(pd.merge(ratings, users), scenarys)

    # Split into model inputs (X) and the rating target (y).
    target_fields = ['ratings']
    features_pd = data.drop(target_fields, axis=1)
    targets_pd = data[target_fields]

    return (features_pd.values, targets_pd.values,
            ratings, users, scenarys, data)


# Run the preprocessing pipeline once at module load.
features, targets_values, ratings, users, scenarys, data = load_data()



# Size of every embedding vector.
embed_dim = 32
# Number of user IDs (feature column 0).
# NOTE(review): computed but never used below — the graph has no user-ID
# placeholder or embedding; confirm whether that was intentional.
uid_max = max(features.take(0,1)) + 1 
# Number of gender categories (column 2).
gender_max = max(features.take(2,1)) + 1 
# Number of age categories (column 3).
age_max = max(features.take(3,1)) + 1 
# Number of occupations (column 4).
job_max = max(features.take(4,1)) + 1
# Number of areas (column 5).
area_max = max(features.take(5,1)) + 1
# Favourite-type vocabulary size.
# assumes the FT codes lie in 0..18 — TODO confirm against users.txt
ft_max = 18 + 1

# Number of scenary IDs (column 1).
scenary_id_max = max(features.take(1,1)) + 1 
# Scenary genre vocabulary size.
# assumes the genre codes lie in 0..18 — TODO confirm against scenarys.txt
scenary_categories_max = 18 + 1 



def get_inputs():
    '''
    Create the graph's input placeholders.

    Returns the user-side placeholders (gender, age, job, area, ft),
    the scenary-side placeholders (id, categories), the rating target,
    the learning rate and the dropout keep probability, in that order.
    '''
    def _int_input(width, label):
        # Integer feed of shape [batch, width].
        return tf.placeholder(tf.int32, [None, width], name=label)

    # User-side inputs: four single-valued categorical features plus the
    # 18-slot favourite-type list.
    gender_ph = _int_input(1, "user_gender")
    age_ph = _int_input(1, "user_age")
    job_ph = _int_input(1, "user_job")
    area_ph = _int_input(1, "user_area")
    ft_ph = _int_input(18, "user_ft")

    # Scenary-side inputs: the ID plus the 18-slot genre list.
    sid_ph = _int_input(1, "scenary_id")
    categories_ph = _int_input(18, "scenary_categories")

    # Training target and hyper-parameter feeds.
    targets_ph = _int_input(1, "targets")
    lr_ph = tf.placeholder(tf.float32, name="LearningRate")
    keep_prob_ph = tf.placeholder(tf.float32, name="dropout_keep_prob")

    return (gender_ph, age_ph, job_ph, area_ph, ft_ph,
            sid_ph, categories_ph, targets_ph, lr_ph, keep_prob_ph)


def get_user_embedding(user_gender, user_age, user_job, user_area, user_ft):
    '''
    Build the user-side embedding lookups.

    gender/age/job/area are [batch, 1] int inputs, each looked up in its
    own (vocab_size, embed_dim // 2) matrix, producing
    [batch, 1, embed_dim // 2] tensors.  user_ft is a [batch, 18] input
    looked up in a (ft_max, embed_dim) matrix; its 18 vectors are summed
    into a single [batch, 1, embed_dim] tensor.
    '''
    with tf.name_scope("user_embedding"):
        # Gender embedding matrix.
        gender_embed_matrix = tf.Variable(tf.random_uniform([gender_max, embed_dim // 2], -1, 1), name= "gender_embed_matrix")
        gender_embed_layer = tf.nn.embedding_lookup(gender_embed_matrix, user_gender, name = "gender_embed_layer")
        # Age embedding matrix.
        age_embed_matrix = tf.Variable(tf.random_uniform([age_max, embed_dim // 2], -1, 1), name="age_embed_matrix")
        age_embed_layer = tf.nn.embedding_lookup(age_embed_matrix, user_age, name="age_embed_layer")
        # Occupation embedding matrix.
        job_embed_matrix = tf.Variable(tf.random_uniform([job_max, embed_dim // 2], -1, 1), name = "job_embed_matrix")
        job_embed_layer = tf.nn.embedding_lookup(job_embed_matrix, user_job, name = "job_embed_layer")
        # Area embedding matrix.
        area_embed_matrix = tf.Variable(tf.random_uniform([area_max, embed_dim // 2], -1, 1), name = "area_embed_matrix")
        area_embed_layer = tf.nn.embedding_lookup(area_embed_matrix, user_area, name = "area_embed_layer")
        # Favourite-type embedding matrix.
        ft_embed_matrix = tf.Variable(tf.random_uniform([ft_max, embed_dim], -1, 1), name = "ft_embed_matrix")
        ft_embed_layer = tf.nn.embedding_lookup(ft_embed_matrix, user_ft, name = "ft_embed_layer")
        # Sum the 18 favourite-type vectors element-wise; `keepdims`
        # (the current spelling of the deprecated `keep_dims`) keeps the
        # middle axis so the result stays [batch, 1, embed_dim].
        ft_embed_layer = tf.reduce_sum(ft_embed_layer, axis=1, keepdims=True)
    return  gender_embed_layer, age_embed_layer, job_embed_layer, area_embed_layer, ft_embed_layer



def get_scenary_embedding(scenary_id,scenary_categories):
    '''
    Build the scenary-side embedding lookups.

    scenary_id is a [batch, 1] int input looked up in a
    (scenary_id_max, embed_dim) matrix; scenary_categories is a
    [batch, 18] input whose 18 genre vectors are summed into a single
    [batch, 1, embed_dim] tensor.
    '''
    with tf.name_scope("scenary_embedding"):
        # Scenary-ID embedding matrix.
        scenary_id_embed_matrix = tf.Variable(tf.random_uniform([scenary_id_max, embed_dim], -1, 1), name = "scenary_id_embed_matrix")
        scenary_id_embed_layer = tf.nn.embedding_lookup(scenary_id_embed_matrix, scenary_id, name = "scenary_id_embed_layer")
        # Genre embedding matrix.
        scenary_categories_embed_matrix = tf.Variable(tf.random_uniform([scenary_categories_max, embed_dim], -1, 1), name = "scenary_categories_embed_matrix")
        scenary_categories_embed_layer = tf.nn.embedding_lookup(scenary_categories_embed_matrix, scenary_categories, name = "scenary_categories_embed_layer")
        # Sum the 18 genre vectors element-wise; `keepdims` replaces the
        # deprecated `keep_dims` argument spelling.
        scenary_categories_embed_layer = tf.reduce_sum(scenary_categories_embed_layer, axis=1, keepdims=True)
 
    return scenary_id_embed_layer,scenary_categories_embed_layer


def get_user_feature_layer(gender_embed_layer, age_embed_layer, job_embed_layer,area_embed_layer, ft_embed_layer):
    '''
    Fuse the five user embeddings into one 200-d user feature vector.

    Each embedding first goes through its own ReLU dense layer of width
    embed_dim; the outputs are concatenated along the feature axis and
    mapped by a tanh dense layer to 200 units.  Returns both the 3-D
    combined layer and its [-1, 200] flattened form.
    '''
    with tf.name_scope("user_fc"):
        # First fully-connected layer: one ReLU projection per embedding.
        projected = [
            tf.layers.dense(tensor, embed_dim, name=label, activation=tf.nn.relu)
            for tensor, label in (
                (gender_embed_layer, "gender_fc_layer"),
                (age_embed_layer, "age_fc_layer"),
                (job_embed_layer, "job_fc_layer"),
                (area_embed_layer, "area_fc_layer"),
                (ft_embed_layer, "ft_fc_layer"),
            )
        ]

        # Second fully-connected layer over the concatenated projections.
        fused = tf.concat(projected, 2)
        user_combine_layer = tf.layers.dense(fused, 200, tf.tanh)

        user_combine_layer_flat = tf.reshape(user_combine_layer, [-1, 200])
    return user_combine_layer, user_combine_layer_flat


def get_scenary_feature_layer(scenary_id_embed_layer, scenary_categories_embed_layer):
    '''
    Fuse the two scenary embeddings into one 200-d scenary feature
    vector: a ReLU dense projection per embedding, then a tanh dense
    layer over their concatenation.  Returns both the 3-D combined
    layer and its [-1, 200] flattened form.
    '''
    with tf.name_scope("scenary_fc"):
        # First fully-connected layer: ReLU projection of each embedding.
        id_fc = tf.layers.dense(scenary_id_embed_layer, embed_dim,
                                name="scenary_id_fc_layer",
                                activation=tf.nn.relu)
        categories_fc = tf.layers.dense(scenary_categories_embed_layer, embed_dim,
                                        name="scenary_categories_fc_layer",
                                        activation=tf.nn.relu)

        # Second fully-connected layer over the concatenated features.
        fused = tf.concat([id_fc, categories_fc], 2)
        scenary_combine_layer = tf.layers.dense(fused, 200, tf.tanh)

        scenary_combine_layer_flat = tf.reshape(scenary_combine_layer, [-1, 200])
    return scenary_combine_layer, scenary_combine_layer_flat


# Build the training graph.
tf.reset_default_graph()
train_graph = tf.Graph()
with train_graph.as_default():
    # Input placeholders.
    user_gender, user_age, user_job, user_area, user_ft, scenary_id, scenary_categories, targets, lr, dropout_keep_prob = get_inputs()
    # The five user-side embedding lookups.
    gender_embed_layer, age_embed_layer, job_embed_layer, area_embed_layer, ft_embed_layer = get_user_embedding(user_gender, user_age, user_job, user_area, user_ft)
    # 200-d user feature vector.
    user_combine_layer, user_combine_layer_flat = get_user_feature_layer(gender_embed_layer, age_embed_layer, job_embed_layer, area_embed_layer, ft_embed_layer)
    # Scenary-side embedding lookups.
    scenary_id_embed_layer,scenary_categories_embed_layer = get_scenary_embedding(scenary_id,scenary_categories)
    # 200-d scenary feature vector.
    scenary_combine_layer, scenary_combine_layer_flat = get_scenary_feature_layer(scenary_id_embed_layer, 
                                                                                scenary_categories_embed_layer)

    with tf.name_scope("inference"):
        # Predicted rating = dot product of the user and scenary feature
        # vectors, expanded back to shape [batch, 1].
        inference = tf.reduce_sum(user_combine_layer_flat * scenary_combine_layer_flat, axis=1)
        inference = tf.expand_dims(inference, axis=1)

    with tf.name_scope("loss"):
        # MSE between predicted and true ratings.
        # NOTE(review): mean_squared_error already returns a scalar, so
        # the reduce_mean below is effectively a no-op — confirm.
        cost = tf.losses.mean_squared_error(targets, inference )
        loss = tf.reduce_mean(cost)
    # Minimize the loss with Adam (back-propagation).
    global_step = tf.Variable(0, name="global_step", trainable=False)
    optimizer = tf.train.AdamOptimizer(lr)
    gradients = optimizer.compute_gradients(loss)
    train_op = optimizer.apply_gradients(gradients, global_step=global_step)
    
    
# Number of training epochs.
num_epochs = 5
# Mini-batch size.
batch_size = 256
# Dropout keep probability used during training.
dropout_keep = 0.5
# Learning rate.
learning_rate = 0.0001
# Print progress every n batches.
show_every_n_batches = 20
# Checkpoint path prefix for tf.train.Saver.
save_dir = './save'

def get_batches(Xs, ys, batch_size):
    """
    Yield successive (X, y) mini-batch pairs of size `batch_size`;
    the final pair may be shorter if len(Xs) is not a multiple.
    """
    total = len(Xs)
    begin = 0
    while begin < total:
        stop = begin + batch_size
        # Slicing clamps at the sequence end, so no explicit min() needed.
        yield Xs[begin:stop], ys[begin:stop]
        begin = stop
        
        
# Plotting / timing utilities for the training loop below.
import matplotlib.pyplot as plt
import time
import datetime
# Per-batch losses, recorded for the plots drawn after training.
losses = {'train':[], 'test':[]}

with tf.Session(graph=train_graph) as sess:
    
    # Collect data for TensorBoard.
    # Keep track of gradient values and sparsity
    grad_summaries = []
    for g, v in gradients:
        if g is not None:
            grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name.replace(':', '_')), g)
            sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name.replace(':', '_')), tf.nn.zero_fraction(g))
            grad_summaries.append(grad_hist_summary)
            grad_summaries.append(sparsity_summary)
    grad_summaries_merged = tf.summary.merge(grad_summaries)
        
    # Per-run output directory, keyed by timestamp.
    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
    print("Writing to {}\n".format(out_dir))
     
    # Loss summary.
    loss_summary = tf.summary.scalar("loss", loss)

    # Training summaries.
    train_summary_op = tf.summary.merge([loss_summary, grad_summaries_merged])
    train_summary_dir = os.path.join(out_dir, "summaries", "train")
    train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

    # Test (inference) summaries.
    inference_summary_op = tf.summary.merge([loss_summary])
    inference_summary_dir = os.path.join(out_dir, "summaries", "inference")
    inference_summary_writer = tf.summary.FileWriter(inference_summary_dir, sess.graph)

    # Initialize all variables.
    sess.run(tf.global_variables_initializer())
    # Saver for checkpointing the trained model.
    saver = tf.train.Saver()
    for epoch_i in range(num_epochs):
        
        # Split into train/test sets.
        # NOTE(review): random_state=0 makes the split identical every
        # epoch (the original comment claimed an unfixed seed) — confirm
        # whether a per-epoch reshuffle was intended.
        train_X,test_X, train_y, test_y = train_test_split(features,  
                                                           targets_values,  
                                                           test_size = 0.2,  
                                                           random_state = 0)  
        # Slice into mini-batches.
        train_batches = get_batches(train_X, train_y, batch_size)
        test_batches = get_batches(test_X, test_y, batch_size)
    
        # Training iterations; record the training loss.
        # The trailing partial batch is dropped by the // division.
        for batch_i in range(len(train_X) // batch_size):
            x, y = next(train_batches)

            # Column 7 of the feature matrix holds the 18-element genre lists.
            categories = np.zeros([batch_size, 18])
            for i in range(batch_size):
                categories[i] = x.take(7,1)[i]
                
            # Column 6 holds the 18-element favourite-type lists.
            typies = np.zeros([batch_size, 18])
            for i in range(batch_size):
                typies[i] = x.take(6,1)[i]


            # Build the feed dict.
            feed = {
                user_gender: np.reshape(x.take(2,1), [batch_size, 1]),
                user_age: np.reshape(x.take(3,1), [batch_size, 1]),
                user_job: np.reshape(x.take(4,1), [batch_size, 1]),
                user_area: np.reshape(x.take(5,1),[batch_size,1]),
                user_ft: typies,
                scenary_id: np.reshape(x.take(1,1), [batch_size, 1]),
                scenary_categories: categories,
                targets: np.reshape(y, [batch_size, 1]),
                # NOTE(review): dropout_keep_prob is fed but no dropout op
                # consumes it in the graph built above — confirm.
                dropout_keep_prob: dropout_keep,
                lr: learning_rate}

            # Run one optimization step.
            step, train_loss, summaries, _ = sess.run([global_step, loss, train_summary_op, train_op], feed)
            losses['train'].append(train_loss)
            # Persist the summaries for TensorBoard.
            train_summary_writer.add_summary(summaries, step)
            
            # Print progress every show_every_n_batches batches.
            if (epoch_i * (len(train_X) // batch_size) + batch_i) % show_every_n_batches == 0:
                time_str = datetime.datetime.now().isoformat()
                print('{}: Epoch {:>3} Batch {:>4}/{}   train_loss = {:.3f}'.format(
                    time_str,
                    epoch_i,
                    batch_i,
                    (len(train_X) // batch_size),
                    train_loss))
                
        # Evaluation pass over the test batches.
        for batch_i  in range(len(test_X) // batch_size):
            x, y = next(test_batches)
            
            # Same column extraction as the training loop above.
            categories = np.zeros([batch_size, 18])
            for i in range(batch_size):
                categories[i] = x.take(7,1)[i]
                
                
            typies = np.zeros([batch_size, 18])
            for i in range(batch_size):
                typies[i] = x.take(6,1)[i]
                
            # Feed dict; keep probability 1 disables dropout at eval time.
            feed = {               
                user_gender: np.reshape(x.take(2,1), [batch_size, 1]),
                user_age: np.reshape(x.take(3,1), [batch_size, 1]),
                user_job: np.reshape(x.take(4,1), [batch_size, 1]),
                user_area: np.reshape(x.take(5,1),[batch_size,1]),
                user_ft: typies,
                scenary_id: np.reshape(x.take(1,1), [batch_size, 1]),
                scenary_categories: categories,
                targets: np.reshape(y, [batch_size, 1]),
                dropout_keep_prob: 1,
                lr: learning_rate}
            
            # Compute the test loss (no train_op: weights are not updated).
            step, test_loss, summaries = sess.run([global_step, loss, inference_summary_op], feed)

            # Record the test loss.
            losses['test'].append(test_loss)
            inference_summary_writer.add_summary(summaries, step)  

            # Print progress every show_every_n_batches batches.
            time_str = datetime.datetime.now().isoformat()
            if (epoch_i * (len(test_X) // batch_size) + batch_i) % show_every_n_batches == 0:
                print('{}: Epoch {:>3} Batch {:>4}/{}   test_loss = {:.3f}'.format(
                    time_str,
                    epoch_i,
                    batch_i,
                    (len(test_X) // batch_size),
                    test_loss))

    # Checkpoint the trained model under the save_dir prefix.
    saver.save(sess, save_dir)
    print('Model Trained and Saved')

    
# Persist the checkpoint path so the inference stage below can reload it.
pickle.dump((save_dir), open('params.p', 'wb'))


import matplotlib as mpl
# Use a font with CJK glyphs so the Chinese plot labels render correctly.
mpl.rcParams['font.sans-serif'] = ['SimHei']

# Training-loss curve (label/axis strings are intentionally Chinese).
plt.figure(figsize=(8,6))
plt.plot(losses['train'], label='训练损失')
plt.legend()
plt.xlabel("批次")
plt.ylabel("损失")
_ = plt.ylim()

# Test-loss curve.
plt.figure(figsize=(8,6))
plt.plot(losses['test'], label='测试损失')
plt.legend()
plt.xlabel("批次")
plt.ylabel("损失")
_ = plt.ylim()




# Reload the checkpoint path saved by the training stage above.
load_dir = pickle.load(open('params.p', mode='rb'))

def get_tensors(loaded_graph):
    '''
    Pull the named tensors back out of a restored graph using
    get_tensor_by_name(), returning them in a fixed order.
    '''
    fetch = loaded_graph.get_tensor_by_name

    # Placeholders fed at inference time.
    user_gender = fetch("user_gender:0")
    user_age = fetch("user_age:0")
    user_job = fetch("user_job:0")
    user_area = fetch("user_area:0")
    user_ft = fetch("user_ft:0")
    scenary_id = fetch("scenary_id:0")
    scenary_categories = fetch("scenary_categories:0")
    targets = fetch("targets:0")
    dropout_keep_prob = fetch("dropout_keep_prob:0")
    lr = fetch("LearningRate:0")

    # Outputs: the final prediction and the two towers' flat features.
    inference = fetch("inference/ExpandDims:0")
    scenary_combine_layer_flat = fetch("scenary_fc/Reshape:0")
    user_combine_layer_flat = fetch("user_fc/Reshape:0")

    return (user_gender, user_age, user_job, user_area, user_ft,
            scenary_id, scenary_categories, targets, lr,
            dropout_keep_prob, inference,
            scenary_combine_layer_flat, user_combine_layer_flat)


loaded_graph = tf.Graph()
scenary_matrics = []
with tf.Session(graph=loaded_graph) as sess:
    # Restore the trained model from the saved checkpoint.
    loader = tf.train.import_meta_graph(load_dir + '.meta')
    loader.restore(sess, load_dir)

    # Fetch the needed tensors from the restored graph.
    user_gender, user_age, user_job, user_area,user_ft,scenary_id, scenary_categories, targets, lr, dropout_keep_prob, _, scenary_combine_layer_flat, __ = get_tensors(loaded_graph)
    
    # Run every scenary through the scenary tower to obtain its
    # 200-d feature vector.
    for item in scenarys.values:
        # item[1] is the 18-element genre list built by load_data.
        categories = np.zeros([1, 18])
        categories[0] = item.take(1)


        feed = {
            scenary_id: np.reshape(item.take(0), [1, 1]),
            scenary_categories: categories,
            dropout_keep_prob: 1}

        scenary_combine_layer_flat_val = sess.run([scenary_combine_layer_flat], feed)
        # Accumulate the per-scenary feature vectors.
        scenary_matrics.append(scenary_combine_layer_flat_val)
        
# Save the stacked (num_scenarys, 200) matrix to scenary_matrics.p.
pickle.dump((np.array(scenary_matrics).reshape(-1, 200)), open('scenary_matrics.p', 'wb'))



